The dataset contains data on 5000 customers. The data include customer demographic information (age, income, etc.), the customer's relationship with the bank (mortgage, securities account, etc.), and the customer response to the last personal loan campaign (Personal Loan). Among these 5000 customers, only 480 (= 9.6%) accepted the personal loan that was offered to them in the earlier campaign.
This case is about a bank (Thera Bank) whose management wants to explore ways of converting its liability customers to personal loan customers (while retaining them as depositors). A campaign that the bank ran last year for liability customers showed a healthy conversion rate of over 9% success. This has encouraged the retail marketing department to devise campaigns with better target marketing to increase the success ratio with a minimal budget.
The classification goal is to predict the likelihood that a liability customer will purchase a personal loan.
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LogisticRegression
# Shared estimator instance that is re-fit in every modelling iteration below.
logit = LogisticRegression(solver='newton-cg')
from sklearn.model_selection import train_test_split as tts
# NOTE(review): SimpleImputer and statsmodels are imported but not used in the
# code visible here - confirm before removing.
from sklearn.impute import SimpleImputer as si
from sklearn import metrics as mtr
import statsmodels.api as sm
import warnings
# Suppress all warnings globally (this also hides sklearn shape/conversion warnings).
warnings.filterwarnings('ignore')
# Display floats with a thousands separator and 2 decimal places.
pd.options.display.float_format = '{:,.2f}'.format
from IPython.core.display import display, HTML
# Widen the notebook container for easier side-by-side viewing.
display(HTML("<style>.container { width:90% !important; }</style>"))
sns.set_style(style='darkgrid')
# Reading file
df = pd.read_csv('Bank_Personal_Loan_Modelling.csv')
# Renaming columns for ease of working with them
df.rename(columns={'ZIP Code':'Zipcode','Personal Loan':'AccptPersLoan','Securities Account':'HaveSecAcct','CD Account':'HaveCDAcct','Online':'HaveOnline','CreditCard':'HaveCC'},inplace=True)
#Checking data-types of each column
df.info()
df.shape
# Checking for null values
df.isnull().sum()
#Checking for 'zero' values per column (a zero may be legitimate or a placeholder)
df[df==0].count()
#Statistical Summary (transposed: one row per column)
df.describe().T
# Checking for skewness of the data
df.skew()
# Visual analysis of data skewness: histograms of the numeric features
df[['Age','Experience','Income','Zipcode','Family','CCAvg','Education','Mortgage']].hist(bins=25,figsize=[25,15])
# Number of unique values in each column?
df.nunique()
# Number of people with zero mortgage?
print('Number of people with zero mortgage : ',df.ID[df.Mortgage == 0].count())
# Number of people with zero credit card spending per month?
print('Number of people with zero credit card spending per month : ',df.ID[df.CCAvg == 0].count())
# Value counts of all categorical columns.
# BUG FIX: the loop body was not indented under the for statement, which is a
# SyntaxError when this runs as a plain Python script.
for col in ['AccptPersLoan','HaveSecAcct','HaveCDAcct','HaveOnline','HaveCC','Family','Education','Experience','Zipcode'] :
    print('Value Counts of {} \n{}\n\n'.format(col,df[col].value_counts()))
# Subset of continuous features (plus the target) for pairwise visualisation.
dfviz = df[['Age','Experience','Income','CCAvg','Mortgage','AccptPersLoan']].copy()
plt.figure(figsize=[30,20])
sns.set(font_scale=1.15)
# sns.pairplot(dfviz)
# Pairplot coloured by loan acceptance to eyeball class separation.
sns.pairplot(dfviz, hue='AccptPersLoan')
plt.show()
plt.figure(figsize=[20,15])
sns.set(font_scale=1.15)
# Correlation heatmap including the target column.
sns.heatmap(df.corr(),annot=True,vmin=-1,vmax=1,center=0, linewidth=0.2, fmt='.2f')
plt.show()
# Feature-feature correlations only (target column dropped).
corr = df.drop('AccptPersLoan',axis=1).corr()
plt.figure(figsize=[20,15])
sns.set(font_scale=1.15)
sns.heatmap(corr,annot=True,vmin=-1,vmax=1,center=0, linewidth=0.2, fmt='.2f', cmap='coolwarm')
plt.show()
# Comparing the distribution of Experience in relation to acceptance of Loan
# (the original comment said "Age", but the plotted column is Experience)
fig = px.violin(df, color='AccptPersLoan', y='Experience', width=1000)
fig.show()
# Same comparison for average monthly credit-card spending (CCAvg).
fig = px.violin(df, color='AccptPersLoan', y='CCAvg', width=1000)
fig.show()
# Dropping ID column since that is an identifier and not a feature
df.drop('ID', axis=1, inplace=True)
# Dropping observations with incorrect data (with outlier 4 digit Zipcode)
df.drop(df[df.Zipcode < 90000].index, inplace=True)
# Converting zipcode to bins
ziplabels = ['900-905','905-910','910-915','915-920','920-925','925-930','930-935','935-940','940-945','945-950','950-955','955-960','960+']
# BUG FIX: pd.cut bins are right-closed and exclude the left edge by default,
# so a Zipcode of exactly 90000 (which survives the < 90000 filter above)
# would fall into no bin and become NaN. include_lowest=True keeps the lower
# boundary inside the first bin.
df['Zipcode'] = pd.cut(df.Zipcode, bins=[90000, 90500, 91000, 91500, 92000, 92500, 93000, 93500, 94000, 94500, 95000, 95500, 96000,100000], labels=ziplabels, include_lowest=True)
df.head()
# Checking for acceptance of loans across the zipcodes
fig = px.histogram(df, x='Zipcode', color='AccptPersLoan', barmode='group',
                   category_orders={'Zipcode': ziplabels},
                   width=1000)
fig.show()
# Dropping observations with incorrect data (with Experience less than zero)
df.drop(df[df.Experience < 0 ].index, inplace=True)
# Checking distribution of CC Average
fig = px.histogram(df, x='CCAvg', marginal='rug', width=1000)
fig.show()
# Converting CC Average to bins - given it is already represented in '000s'
cclabels = ['0-1K','1-2K','2-3K','3-4K','4-5K','5-8K','8K+']
df['CCAvg'] = pd.cut(df.CCAvg, bins=[0, 1, 2, 3, 4, 5, 8,12], labels=cclabels)
df.head()
# Treat the ordinal codes as categoricals so get_dummies one-hot encodes them.
df.Family = df.Family.astype('category')
df.Education = df.Education.astype('category')
# One-hot encode every categorical column (Family, Education, Zipcode bins, CCAvg bins).
df = pd.get_dummies(df)
df.describe().T
df.info()
# Target as a single-column DataFrame; features are everything else.
dfy = df[['AccptPersLoan']].copy()
dfX = df.drop('AccptPersLoan',axis=1).copy()
# NOTE(review): the target is ~90/10 imbalanced; stratify=dfy would keep the
# class ratio identical in both splits - confirm before changing.
X_train, X_test, y_train, y_test = tts(dfX, dfy, test_size=0.3, random_state=1)
X_test.describe().T
# ---- Iteration 0: baseline logistic regression on the dummified features ----
# Fit on the training split only.
logit.fit(X_train,y_train)
print('Training score : ',logit.score(X_train, y_train))
# BUG FIX: the original called logit.fit(X_test, y_test) here, re-training on
# the test data before scoring it - that leaks the test set and makes the
# "Test score" meaningless. Score the train-fitted model on held-out data.
print('Test score : ',logit.score(X_test, y_test))
y_predict = logit.predict(X_test)
# labels=[1,0] puts the positive class first: rows = actual, columns = predicted.
cmat = mtr.confusion_matrix(y_test,y_predict, labels=[1,0])
dfcm = pd.DataFrame(cmat, index=['1','0'],columns=['1','0'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
# BUG FIX: rows of the confusion matrix are the ACTUAL labels, not predictions.
plt.ylabel('Actual\n')
plt.show()
npcm = np.array(cmat)
# npcm[0,0] = TP; row 0 sums to actual positives; column 0 to predicted positives.
print('Accuracy of the model : ',npcm.trace()/npcm.sum())
# BUG FIX: recall and precision were swapped in the original.
# Recall = TP / actual positives; Precision = TP / predicted positives.
print('Recall of the model : ',npcm[0,0]/npcm[0].sum())
print('Precision of the model : ',npcm[0,0]/npcm[:,0].sum())
# [accuracy, recall, precision] column vector for the later comparison table.
appr0 = np.array([[npcm.trace()/npcm.sum()],[npcm[0,0]/npcm[0].sum()],[npcm[0,0]/npcm[:,0].sum()]])
appr0
# ---- Iteration 1: add binned Mortgage and categorical account flags ----
df1 = df.copy()
# Distribution of non-zero mortgages (zero-mortgage rows are excluded here).
fig = px.histogram(df1[df1.Mortgage > 0 ], x='Mortgage', marginal='rug', width=1000)
fig.show()
# Converting Mortgage to bins
mortgagelabels = ['Zero','0-100K','100-150K','150-200K','200-250K','250-300K','300-400K','400K+']
# Lower edge of -1 so that Mortgage == 0 lands in the dedicated 'Zero' bin.
df1.Mortgage = pd.cut(df1.Mortgage, bins=[-1,0,100, 150, 200, 250, 300, 400, 1000], labels=mortgagelabels)
df1.Mortgage.value_counts(normalize=True)*100
# One-hot encode the new Mortgage bins.
df1 = pd.get_dummies(df1)
# NOTE(review): these astype('category') calls run AFTER get_dummies, so the
# 0/1 flag columns keep their single-column form and only change dtype -
# confirm this ordering is intentional.
df1.HaveSecAcct = df1.HaveSecAcct.astype('category')
df1.HaveCDAcct = df1.HaveCDAcct.astype('category')
df1.HaveOnline = df1.HaveOnline.astype('category')
df1.HaveCC = df1.HaveCC.astype('category')
df1.info()
dfy1 = df1[['AccptPersLoan']].copy()
dfX1 = df1.drop('AccptPersLoan',axis=1).copy()
X1_train, X1_test, y1_train, y1_test = tts(dfX1, dfy1, test_size=0.3, random_state=30)
# Class balance of the training target, in percent.
y1_train.AccptPersLoan.value_counts(normalize=True)*100
# ---- Iteration 1 model: fit, score, and evaluate ----
# Fit on the training split only.
logit.fit(X1_train,y1_train)
print('Training score : ',logit.score(X1_train, y1_train))
# BUG FIX: the original re-fit the model on the test split before scoring it,
# leaking the test data; score the train-fitted model instead.
print('Test score : ',logit.score(X1_test, y1_test))
y1_predict = logit.predict(X1_test)
# labels=[1,0] puts the positive class first: rows = actual, columns = predicted.
cmat1 = mtr.confusion_matrix(y1_test,y1_predict, labels=[1,0])
dfcm1 = pd.DataFrame(cmat1, index=['1','0'],columns=['1','0'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm1, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
# BUG FIX: rows of the confusion matrix are the actual labels, not predictions.
plt.ylabel('Actual\n')
plt.show()
npcm1 = np.array(cmat1)
# npcm1[0,0] = TP; row 0 = actual positives; column 0 = predicted positives.
print('Accuracy of the model : ',npcm1.trace()/npcm1.sum())
# BUG FIX: recall and precision were swapped in the original.
print('Recall of the model : ',npcm1[0,0]/npcm1[0].sum())
print('Precision of the model : ',npcm1[0,0]/npcm1[:,0].sum())
# [accuracy, recall, precision] column vector for the comparison table.
appr1 = np.array([[npcm1.trace()/npcm1.sum()],[npcm1[0,0]/npcm1[0].sum()],[npcm1[0,0]/npcm1[:,0].sum()]])
appr1
# ---- Iteration 2: log-transform Income to reduce its right skew ----
df2 = df1.copy()
# NOTE(review): np.log(0) is -inf; this assumes every Income is > 0 - confirm.
df2['Income_Log'] = np.log(df2.Income)
# Compare the spread/outliers of raw vs log-transformed income.
fig = px.box(df2, x='Income', width=1000, height=200)
fig.show()
fig = px.box(df2, x='Income_Log', width=1000, height=200)
fig.show()
# Keep only the log-transformed income as a feature.
df2.drop('Income',axis=1,inplace=True)
dfy2 = df2[['AccptPersLoan']].copy()
dfX2 = df2.drop('AccptPersLoan',axis=1).copy()
X2_train, X2_test, y2_train, y2_test = tts(dfX2, dfy2, test_size=0.3, random_state=30)
# Class balance of the training target, in percent.
y2_train.AccptPersLoan.value_counts(normalize=True)*100
# ---- Iteration 2 model: fit, score, and evaluate ----
# Fit on the training split only.
logit.fit(X2_train,y2_train)
print('Training score : ',logit.score(X2_train, y2_train))
# BUG FIX: the original re-fit the model on the test split before scoring it,
# leaking the test data; score the train-fitted model instead.
print('Test score : ',logit.score(X2_test, y2_test))
y2_predict = logit.predict(X2_test)
# labels=[1,0] puts the positive class first: rows = actual, columns = predicted.
cmat2 = mtr.confusion_matrix(y2_test,y2_predict, labels=[1,0])
dfcm2 = pd.DataFrame(cmat2, index=['1','0'],columns=['1','0'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm2, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
# BUG FIX: rows of the confusion matrix are the actual labels, not predictions.
plt.ylabel('Actual\n')
plt.show()
npcm2 = np.array(cmat2)
# npcm2[0,0] = TP; row 0 = actual positives; column 0 = predicted positives.
print('Accuracy of the model : ',npcm2.trace()/npcm2.sum())
# BUG FIX: recall and precision were swapped in the original.
print('Recall of the model : ',npcm2[0,0]/npcm2[0].sum())
print('Precision of the model : ',npcm2[0,0]/npcm2[:,0].sum())
# [accuracy, recall, precision] column vector for the comparison table.
appr2 = np.array([[npcm2.trace()/npcm2.sum()],[npcm2[0,0]/npcm2[0].sum()],[npcm2[0,0]/npcm2[:,0].sum()]])
appr2
# ---- Iteration 3: iteration-2 features + class_weight='balanced' ----
df3 = df2.copy()
dfy3 = df3[['AccptPersLoan']].copy()
dfX3 = df3.drop('AccptPersLoan',axis=1).copy()
X3_train, X3_test, y3_train, y3_test = tts(dfX3, dfy3, test_size=0.3, random_state=30)
# Class balance of the training target, in percent.
y3_train.AccptPersLoan.value_counts(normalize=True)*100
# Re-weight classes inversely proportional to their frequencies to counter
# the heavy imbalance of the target.
logit = LogisticRegression(solver='newton-cg', class_weight='balanced')
# Fit on the training split only.
logit.fit(X3_train,y3_train)
print('Training score : ',logit.score(X3_train, y3_train))
# BUG FIX: the original re-fit the model on the test split before scoring it,
# leaking the test data; score the train-fitted model instead.
print('Test score : ',logit.score(X3_test, y3_test))
y3_predict = logit.predict(X3_test)
# labels=[1,0] puts the positive class first: rows = actual, columns = predicted.
cmat3 = mtr.confusion_matrix(y3_test,y3_predict, labels=[1,0])
dfcm3 = pd.DataFrame(cmat3, index=['1','0'],columns=['1','0'])
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm3, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
# BUG FIX: rows of the confusion matrix are the actual labels, not predictions.
plt.ylabel('Actual\n')
plt.show()
npcm3 = np.array(cmat3)
# npcm3[0,0] = TP; row 0 = actual positives; column 0 = predicted positives.
print('Accuracy of the model : ',npcm3.trace()/npcm3.sum())
# BUG FIX: recall and precision were swapped in the original.
print('Recall of the model : ',npcm3[0,0]/npcm3[0].sum())
print('Precision of the model : ',npcm3[0,0]/npcm3[:,0].sum())
# [accuracy, recall, precision] column vector for the comparison table.
appr3 = np.array([[npcm3.trace()/npcm3.sum()],[npcm3[0,0]/npcm3[0].sum()],[npcm3[0,0]/npcm3[:,0].sum()]])
appr3
Comparing the accuracy, recall, and precision of the four modelling iterations:
# Collect the three metrics of each iteration into one comparison table
# (rows = metric, columns = iteration).
compare = pd.DataFrame('',index=['Accuracy','Recall','Precision'],columns=['Iteration0','Iteration1','Iteration2','Iteration3'])
compare.Iteration0 = appr0
compare.Iteration1 = appr1
compare.Iteration2 = appr2
compare.Iteration3 = appr3
pd.options.display.float_format = '{:,.4f}'.format
compare
# Reshape to long form for plotting: one row per (metric, iteration) pair.
# BUG FIX: the original assigned `cmpr = compare.T` and immediately overwrote
# it on the next line; the dead transpose is removed.
cmpr = compare.reset_index()
cmpr = cmpr.melt(id_vars='index',value_vars=['Iteration0','Iteration1','Iteration2','Iteration3'], var_name='Iteration',value_name='Value')
fig = px.line(cmpr, x='Iteration', y='Value', color='index', range_y=[0.5,1.1], width=1000, height=600)
# Show point markers as well as lines on each of the three metric traces.
for trace in fig.data:
    trace.update(mode='markers+lines')
fig.show()
# Attributes and Co-efficients
# Coefficients of the last-fitted model, one per feature column.
dfcoef = pd.DataFrame({'Attributes':list(X3_test.columns),'Coefficient':list(logit.coef_[0])})
dfcoef.sort_values('Coefficient', ascending=False)
# Showing predicted values different to the Actual values
y3_predict = logit.predict(X3_test)
dfpred = pd.DataFrame({'Actual':(list(y3_test.AccptPersLoan)),'Predicted':list(y3_predict)})
dfpred[dfpred.Actual != dfpred.Predicted]
# Summary showing predicted values different to the Actual values
dfpred[dfpred.Actual != dfpred.Predicted].groupby(['Actual','Predicted'])['Predicted'].agg('count')
# Strongest positive and strongest negative coefficients.
dfcoef.sort_values('Coefficient', ascending=False).head()
dfcoef.sort_values('Coefficient').head()
# Re-display the iteration-3 confusion-matrix heatmap.
plt.figure(figsize=[5,3])
sns.set(font_scale=1.2)
fig = sns.heatmap(dfcm3, annot=True, fmt='d',linewidth=0.5, cbar=False)
plt.tick_params(axis='both', which='major', labelbottom = False, bottom=False, top = False, labeltop=True)
# NOTE(review): cmat3 rows are the ACTUAL labels (confusion_matrix convention),
# so this y-axis label appears mislabeled - confirm and consider 'Actual'.
plt.ylabel('Predicted\n')
plt.show()